Prev Exercises: Udacity:DeepLearning:TensorFlow:notMNIST
notMNIST: This notebook uses the notMNIST dataset to be used with python experiments. This dataset is designed to look like the classic MNIST dataset, while looking a little more like real data: it's a harder task, and the data is a lot less 'clean' than MNIST.
import sys
print sys.version
from joblib import Parallel, delayed
import multiprocessing
nCores = multiprocessing.cpu_count() - 2 # Allow other apps to run
print 'nCores: %d' % (nCores)
from datetime import datetime, time
print 'now: %s' % str(datetime.now())
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import display, Image
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.robjects.lib import grid
from rpy2.robjects.lib import ggplot2
import rpy2.robjects.pandas2ri
import numpy as np
import os
import pandas as pd
from scipy import ndimage
from six.moves.urllib.request import urlretrieve
from six.moves import cPickle as pickle
from skimage import color as sk_color
from skimage import io as sk_io
from skimage import transform as sk_transform
import tarfile
%run img_utils.py
The spec globals (glbDataFile, glbImg, glbRspClass, ...) should be defined in the img_glbSpec_SFDD_* file run below.
%run img_glbSpec_SFDD_ImgSz_64.py
#print 'glbDataFile: %s' % (glbDataFile)
print 'glbImg: %s' % (glbImg)
print 'glbRspClass: %s' % (glbRspClass)
print 'glbRspClassN: %d' % (glbRspClassN)
print 'glbPickleFile: %s' % (glbPickleFile)
# glbDataURL = 'http://yaroslavvb.com/upload/notMNIST/'
# glbImg['size'] = 32
First, we'll download the dataset to our local machine.
def maybe_download(url, filename, expected_bytes = None):
    """Download a file into data/ if not present, and verify its size.

    Args:
        url: base URL; the file is fetched from url + filename.
        filename: bare file name (appended to both the URL and 'data/').
        expected_bytes: exact expected size in bytes; if None, any
            non-empty file is accepted.

    Returns:
        The local path 'data/' + filename.

    Raises:
        Exception: if the file on disk fails the size check.
    """
    localPth = 'data/' + filename
    if not os.path.exists(localPth):
        # Bug fix: download to the same data/ path that is checked,
        # stat'ed and returned below (the original downloaded into the
        # current directory, so os.stat('data/' + filename) raised).
        urlretrieve(url + filename, localPth)
    statinfo = os.stat(localPth)
    if expected_bytes is None:
        # No expected size supplied: accept any non-empty file.
        verified = statinfo.st_size > 0
    else:
        verified = statinfo.st_size == expected_bytes
    if verified:
        print('Found and verified', localPth)
    else:
        raise Exception(
            'Failed to verify' + filename + '. Can you get to it with a browser?')
    return localPth
dataFNm = maybe_download(glbDataFile['url'], glbDataFile['filename'])
# url = 'http://yaroslavvb.com/upload/notMNIST/'
# def maybe_download(url, filename, expected_bytes):
# """Download a file if not present, and make sure it's the right size."""
# if not os.path.exists(filename):
# filename, _ = urlretrieve(url + filename, filename)
# statinfo = os.stat(filename)
# if statinfo.st_size == expected_bytes:
# print('Found and verified', filename)
# else:
# raise Exception(
# 'Failed to verify' + filename + '. Can you get to it with a browser?')
# return filename
# train_filename = maybe_download('data/notMNIST_large.tar.gz', 247336696)
# test_filename = maybe_download('data/notMNIST_small.tar.gz', 8458043)
Extract the dataset from the compressed downloaded file(s).
def extract(filename, num_classes):
    # NOTE(review): this function is currently stubbed out -- the early
    # `return` below makes everything after it dead code, and callers get
    # None instead of the folder list.  The printed line is the pending TODO.
    print("Figure out automatically if data needs to be extracted")
    return
    # --- dead code below: the original tar-extraction logic, kept for
    # reference until the auto-detection above is implemented ---
    tar = tarfile.open(filename)
    root = os.path.splitext(os.path.splitext(filename)[0])[0]  # remove .tar.gz
    print('Extracting data for %s. This may take a while. Please wait.' % root)
    sys.stdout.flush()
    tar.extractall()
    tar.close()
    # My edits: data_folders needs to be modified for the correct path
    # One sub-folder per class; .DS_Store is a macOS artifact to skip.
    data_folders = [
        os.path.join(root, d) for d in sorted(os.listdir(root)) if d != '.DS_Store']
    if len(data_folders) != num_classes:
        raise Exception(
            'Expected %d folders, one per class. Found %d instead.' % (
                num_classes, len(data_folders)))
    print(data_folders)
    return data_folders
# NOTE(review): train_filename / test_filename are never defined in this
# notebook (only dataFNm is, above), so this branch raises NameError when
# glbDataFile['extract'] is True -- presumably dataFNm was intended; verify.
if (glbDataFile['extract']):
    train_folders = extract(os.getcwd() + train_filename, glbRspClassN)
    test_folders = extract(os.getcwd() + test_filename , glbRspClassN)
# Per-image metadata: one row per training image with its subject (driver)
# id, class name and file name.
driverDf = pd.read_csv('data/driver_imgs_list.csv')
print driverDf.describe()
# print driverDf.shape
print driverDf.head()
print driverDf.tail()
print '\n subject knts:'
print driverDf['subject'].value_counts().sort_values()
notMNIST:
Extraction give you a set of directories, labelled A through J.
The data consists of characters rendered in a variety of fonts on a 28x28 image. The labels are limited to 'A' through 'J' (10 classes). The training set has about 500k and the obsNewSet 19000 labelled examples. Given these sizes, it should be possible to train models quickly on any machine.
Let's take a peek at some of the data to make sure it looks sensible.
trnFoldersPth = os.getcwd() + '/data/' + glbDataFile['trnFoldersPth']
newFoldersPth = os.getcwd() + '/data/' + glbDataFile['newFoldersPth']
# print(trnFoldersPth)
# print(newFoldersPth)
Collect data corrections into glbDataScrub
def myreadImage(filePthNm):
    """Read one image from disk and sanity-check its shape and pixel range.

    Args:
        filePthNm: path to the image file.

    Returns:
        The image array; expected to match glbImg['shape'] with pixel
        values in [0, glbImg['pxlDepth']].

    Raises:
        AssertionError: if any sanity check fails (the offending file path
            is printed first, to help collect entries for glbDataScrub).
    """
    img = sk_io.imread(filePthNm)
    try:
        # Bug fix: img.shape is a tuple, so '%s' % (img.shape) raised
        # TypeError ("not all arguments converted"); wrap it in a 1-tuple.
        assert img.shape == glbImg['shape'], 'img.shape: %s' % \
            (img.shape,)
        assert np.min(img) >= 0, 'img.min: %.4f' % \
            (np.min(img))
        # Bug fix: this message previously said 'img.min' for the max check.
        assert np.max(img) <= glbImg['pxlDepth'], 'img.max: %.4f' % \
            (np.max(img))
    except AssertionError as e:
        print('filePthNm: %s' % (filePthNm))
        print(e)
        raise
    return(img)
# plt.imshow(myreadImage(trnFoldersPth + '/c0/img_15117.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c8/img_67168.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_84986.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_89196.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_95888.jpg'))
smpClsImg = {}; smpN = 3
for cls in glbRspClass:
clsImg = {}
# print 'Class: %s' % (cls)
clsPth = trnFoldersPth + '/' + cls
onlyfiles = [f for f in os.listdir(clsPth)
if os.path.isfile(os.path.join(clsPth, f))]
for ix in np.random.randint(0, len(onlyfiles), size = smpN):
# print ' %s:' % (onlyfiles[ix])
# img = sk_io.imread(clsPth + '/' + onlyfiles[ix])
# assert img.shape == (480, 640, 3), 'img.shape: %s' % (img.shape)
# assert np.min(img) == 0, 'img.min: %.4f' % (np.min(img))
# assert np.max(img) == glbImg['pxlDepth'], 'img.min: %.4f' % (np.max(img))
clsImg[onlyfiles[ix]] = myreadImage(clsPth + '/' + onlyfiles[ix])
# jpgfile = Image(clsPth + '/' + onlyfiles[ix], format = 'jpg',
# width = glbImg['size'] * 4, height = glbImg['size'] * 4)
# display(jpgfile)
smpClsImg[cls] = clsImg
# print smpClsImg
figs, axes = plt.subplots(len(glbRspClass), smpN,
figsize=(5 * smpN, 4 * len(glbRspClass)))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, cls in enumerate(smpClsImg.keys()):
for j, imgFileName in enumerate(smpClsImg[cls].keys()):
axes[i, j].imshow(smpClsImg[cls][imgFileName])
axes[i, j].set_title(cls + ':' + imgFileName)
# Sample smpN random subjects, and for each of them smpN random classes,
# collecting one image per subject/class pair for display below.
smpSbtImg = {}; smpN = 3
for sbt in driverDf['subject'].values[
        np.random.randint(0, len(driverDf['subject'].values),
                          size = smpN)]:
    sbtImg = {}
    # print ' subject: %s' % (sbt)
    driverSbtDf = driverDf[driverDf['subject'] == sbt]
    # print driverSbtDf.shape
    # NOTE(review): `cls` here is a stale leftover from the previous cell's
    # loop, and clsPth/onlyfiles computed from it are never used below --
    # these two statements look like dead copy-paste; confirm and remove.
    clsPth = trnFoldersPth + '/' + cls
    onlyfiles = [f for f in os.listdir(clsPth)
                 if os.path.isfile(os.path.join(clsPth, f))]
    for cls in driverSbtDf['classname'].values[
            np.random.randint(0, len(driverSbtDf['classname'].values),
                              size = smpN)]:
        # print ' class: %s' % (cls)
        # print " driverSbtDf[driverSbtDf['classname'] == cls]['img'].shape = %s" % \
        #     (driverSbtDf[driverSbtDf['classname'] == cls]['img'].shape)
        # Always the first image of this subject/class; key is "class:file".
        imgFnm = driverSbtDf[driverSbtDf['classname'] == cls]['img'].iloc[0]
        dctKey = cls + ':' + imgFnm
        imgFnm = trnFoldersPth + '/' + cls + '/' + imgFnm
        # img = sk_io.imread(imgFnm)
        # assert img.shape == (480, 640, 3), 'img.shape: %s' % (img.shape)
        sbtImg[dctKey] = myreadImage(imgFnm)
        # jpgfile = Image(clsPth + '/' + onlyfiles[ix], format = 'jpg',
        #                 width = glbImg['size'] * 4, height = glbImg['size'] * 4)
        # display(jpgfile)
    smpSbtImg[sbt] = sbtImg
# print smpClsImg
# Grid: one row per sampled subject, one column per sampled class image.
nRow = smpN; nCol = smpN
figs, axes = plt.subplots(nRow, nCol,
                          figsize=(6 * nCol, 6 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, sbt in enumerate(smpSbtImg.keys()):
    for j, imgDesc in enumerate(smpSbtImg[sbt].keys()):
        axes[i, j].imshow(smpSbtImg[sbt][imgDesc])
        axes[i, j].set_title(sbt + ':' + imgDesc)
def mytransformImage(raw, retVals = 'final'):
    """Apply the glbImg-configured preprocessing pipeline to one image.

    Stages (each optional, driven by the global glbImg spec, applied in
    order to the running 'fnl' image): crop, resize to glbImg['size'],
    grayscale conversion, center/scale normalization.

    Args:
        raw: raw image array (assumed H x W x C -- TODO confirm).
        retVals: 'final' returns only the fully-processed image; 'each'
            returns a dict of every intermediate stage keyed 'raw', 'crp',
            'sze', 'gry', 'c_s', plus the final image under 'fnl'.  The
            per-stage entries are computed from `raw` independently, for
            side-by-side display; only 'fnl' accumulates all stages.
    """
    assert retVals in ['final', 'each'], \
        'unsupported retVals option: %s' % (retVals)
    prcImgDct = {'raw': raw, 'fnl': raw.astype(float)}
    fnlShape = rawShape = raw.shape
    # 'crop'
    if ('crop' in glbImg.keys()):
        # Default to the full frame; 'x'/'y' spec entries narrow the window.
        xmin = 0; xmax = rawShape[1]
        ymin = 0; ymax = rawShape[0]
        if ('x' in glbImg['crop'].keys()):
            xmin, xmax = glbImg['crop']['x']
        if ('y' in glbImg['crop'].keys()):
            ymin, ymax = glbImg['crop']['y']
        if retVals == 'each':
            # Cropped window blown back up to the raw shape for display.
            prcImgDct['crp'] = sk_transform.resize(raw[ymin : ymax,
                                                       xmin : xmax],
                                                   rawShape)
        prcImgDct['fnl'] = sk_transform.resize(
            prcImgDct['fnl'][ymin : ymax, xmin : xmax],
            rawShape)
    # 'size'
    # if not glbImg['color']:
    #     fnlShape = (glbImg['size'], glbImg['size'], 1)
    # else:
    #     fnlShape = (glbImg['size'], glbImg['size'], rawShape[2])
    fnlShape = (glbImg['size'], glbImg['size'], rawShape[2])
    if (rawShape != fnlShape):
        if retVals == 'each':
            prcImgDct['sze'] = sk_transform.resize(raw, fnlShape)
        prcImgDct['fnl'] = sk_transform.resize(prcImgDct['fnl'], fnlShape)
    # 'color'
    if not glbImg['color']:
        if retVals == 'each':
            prcImgDct['gry'] = sk_color.rgb2gray(raw)
        prcImgDct['fnl'] = sk_color.rgb2gray(prcImgDct['fnl'])
    # 'center_scale'
    if glbImg['center_scale']:
        if retVals == 'each':
            # Map pixel values from [0, pxlDepth] into [-0.5, 0.5].
            prcImgDct['c_s'] = (raw.astype(float) - glbImg['pxlDepth'] / 2.0) / \
                glbImg['pxlDepth']
        prcImgDct['fnl'] = (prcImgDct['fnl'] - glbImg['pxlDepth'] / 2.0) / \
            glbImg['pxlDepth']
    if retVals == 'final':
        return prcImgDct['fnl']
    else:
        return prcImgDct
sbt = smpSbtImg.keys()[0]
tstRawImg = smpSbtImg[sbt][smpSbtImg[sbt].keys()[0]]
tstPrcImg = mytransformImage(tstRawImg, retVals = 'final')
nRow = 1; nCol = 2
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(range(2)):
if (j == 0):
axes[j].imshow(tstRawImg)
axes[j].set_title('raw')
if (j == 1):
if not glbImg['color']:
plt.imshow(tstPrcImg, cmap = 'gray')
else:
plt.imshow(tstPrcImg)
axes[j].set_title('fnl')
plt.show()
tstPrcImg = mytransformImage(tstRawImg, retVals = 'each')
nRow = 1; nCol = 2
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(range(2)):
if (j == 0):
axes[j].imshow(tstRawImg)
axes[j].set_title('raw')
if (j == 1):
if not glbImg['color']:
plt.imshow(tstPrcImg['fnl'], cmap = 'gray')
else:
plt.imshow(tstPrcImg['fnl'])
axes[j].set_title('fnl')
nRow = 1; nCol = len(tstPrcImg.values()) - 2
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for j, typImg in enumerate(list(set(tstPrcImg.keys()) - set(['raw', 'fnl']))):
if (typImg == 'gry'):
axes[j].imshow(tstPrcImg[typImg], cmap = 'gray')
else:
axes[j].imshow(tstPrcImg[typImg])
axes[j].set_title(typImg)
smpSbt0Img = smpSbtImg[smpSbtImg.keys()[0]]
smpPrcImg = {}
for key, value in smpSbt0Img.items():
smpPrcImg[smpSbtImg.keys()[0] + ':' + key] = value
print 'smpPrcImg.keys(): %s' % (smpPrcImg.keys())
for key, raw in smpPrcImg.items():
prcImgDct = mytransformImage(raw, retVals = 'each')
smpPrcImg[key] = prcImgDct
# Ideally 'fnl' should be the last col in the plot
nRow = len(smpPrcImg.keys()); nCol = len(smpPrcImg.values()[0].keys())
# print 'nRow: %d; nCol: %d' % (nRow, nCol)
figs, axes = plt.subplots(nRow, nCol,
figsize=(6 * nCol, 4 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
for i, sbtClsImgFnm in enumerate(smpPrcImg.keys()):
for j, typImg in enumerate(smpPrcImg[sbtClsImgFnm].keys()):
if ((typImg == 'gry') or
((typImg == 'fnl') and ('gry' in smpPrcImg[sbtClsImgFnm].keys()))):
if (nRow > 1):
axes[i, j].imshow(smpPrcImg[sbtClsImgFnm][typImg], cmap = 'gray')
else:
axes[j].imshow(smpPrcImg[sbtClsImgFnm][typImg], cmap = 'gray')
else:
if (nRow > 1):
axes[i, j].imshow(smpPrcImg[sbtClsImgFnm][typImg])
else:
axes[j].imshow(smpPrcImg[sbtClsImgFnm][typImg])
if (nRow > 1):
axes[i, j].set_title(sbtClsImgFnm + ':' + typImg)
else:
axes[j].set_title(sbtClsImgFnm + ':' + typImg)
# Sample smpN**2 unlabeled ('new') images at random and display them in an
# smpN x smpN grid.
onlyfiles = [f for f in os.listdir(newFoldersPth)
             if os.path.isfile(os.path.join(newFoldersPth, f))]
# print onlyfiles[:5]
smpNewImg = {}; smpN = 3
# print smpN ** 2
# print np.random.randint(0, len(onlyfiles), size = smpN ** 2)
for imgFnm in [onlyfiles[ix]
               for ix in np.random.randint(0, len(onlyfiles), size = smpN ** 2)]:
    # print ' imgFnm: %s' % (imgFnm)
    # img = sk_io.imread(newFoldersPth + '/' + imgFnm)
    # assert img.shape == (480, 640, 3), 'img.shape: %s' % (img.shape)
    smpNewImg[imgFnm] = myreadImage(newFoldersPth + '/' + imgFnm)
nRow = smpN; nCol = smpN
figs, axes = plt.subplots(nRow, nCol,
                          figsize=(6 * nCol, 5 * nRow))
[(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
# NOTE: `i / nCol` relies on Python 2 integer division for the row index;
# under Python 3 this would need `i // nCol`.
for i, imgFnm in enumerate(smpNewImg.keys()):
    axes[i / nCol, i % nCol].imshow(smpNewImg[imgFnm])
    axes[i / nCol, i % nCol].set_title(imgFnm)
Each exemplar should be an image of a character A through J rendered in a different font.
# Display sample train images
# train_folders_path = '/Users/bbalaji-2012/Documents/Work/Courses/Udacity/DeepLearning/code/tensorflow/examples/udacity/data/notMNIST_large/'
# glbImg['size'] = 28
# display(Image(train_folders_path + 'A/a2F6b28udHRm.png', \
# width = glbImg['size'] * 4, height = glbImg['size'] * 4))
# display(Image(train_folders_path + 'B/bnVuaS50dGY=.png', \
# width = glbImg['size'] * 4, height = glbImg['size'] * 4))
# display(Image(train_folders_path + 'C/cmlzay50dGY=.png', \
# width = glbImg['size'] * 4, height = glbImg['size'] * 4))
Now let's load the data in a more manageable format.
We'll convert the entire dataset into a 3D array (image index, x, y) of floating point values, normalized to have approximately zero mean (notMNIST only: and standard deviation ~0.5) to make training easier down the road. The labels will be stored in a separate array (notMNIST only: of integers 0 through 9).
A few images might not be readable, we'll just skip them.
trnFolders = os.getcwd() + '/data/' + glbDataFile['trnFoldersPth']
trnFolders = [trnFolders + '/' + cls for cls in glbRspClass]
print 'trnFolders: %s' % (trnFolders)
newFolders = [os.getcwd() + '/data/' + glbDataFile['newFoldersPth']]
print 'newFolders: %s' % (newFolders)
# data_folders_path = '/Users/bbalaji-2012/Documents/Work/Courses/Udacity/DeepLearning/code/tensorflow/examples/udacity/data/'
# train_folders = [data_folders_path + 'notMNIST_large/' + d \
# for d in sorted(os.listdir(data_folders_path + 'notMNIST_large/')) \
# if d != '.DS_Store']
# print train_folders
# test_folders = [data_folders_path + 'notMNIST_small/' + d \
# for d in sorted(os.listdir(data_folders_path + 'notMNIST_small/')) \
# if d != '.DS_Store']
# print test_folders
#from scipy import misc as sp_misc
def _dspRawVsPrcImg(idClass, image, rawImg, prcImg):
    # Plot the raw image next to its fully-processed counterpart
    # (used by load() when verbose=True at progress triggers).
    nRow = 1; nCol = 2
    figs, axes = plt.subplots(nRow, nCol,
                              figsize=(6 * nCol, 4 * nRow))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off'))
        for ax in axes.flatten()]
    axes[0].imshow(rawImg)
    axes[0].set_title(idClass + ':' + image + ':raw')
    if not glbImg['color']:
        axes[1].imshow(prcImg, cmap = 'gray')
    else:
        axes[1].imshow(prcImg)
    axes[1].set_title('fnl')
    plt.show()

def load(idClass, folderPth, nImgMax, maxCheck = True, verbose = False):
    """Load every image of one class from a folder into a dataset tensor.

    Args:
        idClass: class label string; looked up in glbRspClass for the
            numeric label (unknown classes, e.g. 'new', get label -1).
        folderPth: directory containing this class's image files.
        nImgMax: capacity of the pre-allocated dataset tensor.
        maxCheck: if True, raise when the folder holds more than nImgMax
            images; if False, silently stop after nImgMax.
        verbose: if True, plot raw vs. processed image at progress triggers.

    Returns:
        dict {'Cls': idClass,
              'Dbs': {'Idn': list of file names,
                      'Ftr': float32 tensor (nImages, size, size),
                      'Rsp': int32 label vector}}
    """
    assert isinstance(idClass, str), \
        'expecting type(idClass) as str, not %s' % (type(idClass))
    assert isinstance(folderPth, str), \
        'expecting type(folderPth) as str, not %s' % (type(folderPth))
    assert nImgMax > 0, \
        'nImgMax: %d has to be > 0' % (nImgMax)
    assert isinstance(maxCheck, bool), \
        'expecting type(maxCheck) as bool, not %s' % (type(maxCheck))
    startTm = datetime.now()
    ids = ['' for ix in range(nImgMax)]
    dataset = np.ndarray(
        shape=(nImgMax, glbImg['size'], glbImg['size']), dtype=np.float32)
    labels = np.ndarray(shape=(nImgMax), dtype=np.int32)
    try:
        labelsVal = glbRspClass.index(idClass)
    except ValueError:
        # 'new' (unlabeled) observations are tagged -1.
        print('unknown class: %s; defaulting label to -1' % (idClass))
        labelsVal = -1
    except Exception as e:
        print(e)
        raise
    labels[:] = labelsVal
    image_index = 0
    print('Class: %s; Folder: %s' % (idClass, folderPth))
    for image in os.listdir(folderPth):
        if maxCheck and (image_index >= nImgMax):
            raise Exception('More images than expected: %d >= %d' % (
                image_index, nImgMax))
        elif image_index >= nImgMax:
            break
        image_file = os.path.join(folderPth, image)
        try:
            rawImg = myreadImage(image_file)
        except IOError as e:
            print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
            # Bug fix: the original had a bare `next` here, which is a no-op
            # expression (the builtin), so unreadable files fell through and
            # re-processed the previous image (or raised NameError on the
            # first file).  `continue` actually skips the file.
            continue
        prcImg = mytransformImage(rawImg, retVals = 'final')
        ids[image_index] = image
        dataset[image_index, :, :] = prcImg
        if mydspVerboseTrigger(image_index):
            print('    image_index: %5d (%5d secs)' % \
                (image_index, (datetime.now() - startTm).seconds))
            if verbose:
                _dspRawVsPrcImg(idClass, image, rawImg, prcImg)
        image_index += 1
    num_images = image_index
    # Trim the pre-allocated arrays down to the number actually loaded.
    ids = ids[0:num_images]
    dataset = dataset[0:num_images, :, :]
    labels = labels[0:num_images]
    print('    Identifiers:', len(ids))
    print('    Full dataset tensor:', dataset.shape)
    print('    Mean:', np.mean(dataset))
    print('    Standard deviation:', np.std(dataset))
    print('    Labels:', labels.shape)
    print('    Label Knts:'); print(pd.Series(labels).value_counts())
    return {'Cls': idClass, 'Dbs': {'Idn': ids, 'Ftr': dataset, 'Rsp': labels}}
smpC5ObsTrnDct = load('c5', trnFolders[5], 25, maxCheck = False, verbose = True)
smpObsNewDct = load('new', newFolders[0], 25, maxCheck = False, verbose = False)
# smqObsTrnIdn, smqObsTrnFtr, smqObsTrnRsp = load(trnFolders, 250,
# max_check = False)
# print smpObsTrnRsp.value_counts()
# print smpObsTrnIdn[10:15]
# glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = load(trnFolders, 22435)
thsBgnTm = datetime.now()
smqObsTrnLst = []
# for cls in glbRspClass[-2:]:
for cls in glbRspClass:
smqClsObsTrnDct = load(cls, trnFolders[glbRspClass.index(cls)], 25,
maxCheck = False, verbose = False)
smqObsTrnLst.append(smqClsObsTrnDct)
print 'len(smqObsTrnLst): %d' % (len(smqObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'Trn Smp Sequential load duration: %0.2f seconds' % (thsDurDff)
thsBgnTm = datetime.now()
smrObsTrnLst = Parallel(n_jobs = nCores, verbose = 1)(delayed(
load)(cls, trnFolders[glbRspClass.index(cls)], 25,
maxCheck = False, verbose = False) for cls in glbRspClass)
print 'len(smrObsTrnLst): %d' % (len(smrObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'Trn Smp Parallel load duration: %0.2f seconds' % (thsDurDff)
def myisEqualDct(d1, d2):
    """Deep-compare two dicts, printing any differences found.

    Nested dict values are compared recursively; array-like values whose
    `==` yields a non-bool (e.g. numpy arrays) are reduced with .all().
    Returns True iff the two dicts have the same keys and equal values.
    """
    keys1 = set(d1.keys())
    keys2 = set(d2.keys())
    shared = keys1.intersection(keys2)
    added = keys1 - keys2
    removed = keys2 - keys1
    # modified = {o : (d1[o], d2[o]) for o in intersect_keys if d1[o] != d2[o]}
    modified = {}
    for key in shared:
        if isinstance(d1[key], dict):
            # Recurse into nested dicts.
            eql = myisEqualDct(d1[key], d2[key])
        else:
            try:
                eql = d1[key] == d2[key]
            except ValueError as e:
                # e.g. incompatible array shapes; report the key and re-raise.
                print(e)
                print('key: %s: type:' % (key))
                print(type(d1[key]).mro())
                raise
        if not isinstance(eql, bool):
            # Elementwise comparison result -> collapse to a single scalar.
            eql = eql.all()
        if not eql:
            modified[key] = eql
    same = set(key for key in shared if not key in modified.keys())
    if (len(added) > 0):
        print('  added: %s' % (added))
    if (len(removed) > 0):
        print('  removed: %s' % (removed))
    if (len(modified) > 0):
        print('  modified: %s' % (modified))
    if (len(same) != len(keys1)):
        print('  same: %s' % (same))
    return ((len(added) == 0) and
            (len(removed) == 0) and
            (len(modified) == 0) and
            (len(same) == len(keys2)))
tstAB1Dct = {'a': 1, 'b': 1}; tstAB2Dct = {'a': 1, 'b': 2}
print myisEqualDct(tstAB1Dct, tstAB1Dct)
print myisEqualDct(tstAB1Dct, tstAB2Dct)
tstABC1Dct = {'ab': tstAB1Dct, 'c' : 1};
tstABC2Dct = {'ab': tstAB2Dct, 'c' : 3};
print myisEqualDct(tstABC1Dct, tstABC1Dct)
print myisEqualDct(tstABC1Dct, tstABC2Dct)
print 'len(smqObsTrnLst): %d' % (len(smqObsTrnLst))
print 'len(smrObsTrnLst): %d' % (len(smrObsTrnLst))
for clsIx in range(len(glbRspClass)):
# print 'clsIx: %s' % (clsIx)
# print "type(smqObsTrnLst[clsIx]['Dbs']):"
# print (str(type(smqObsTrnLst[clsIx]['Dbs']).mro()))
# print "type(smqObsTrnLst[clsIx]['Dbs']): %s" \
# (str(type(smqObsTrnLst[clsIx]['Dbs']).mro()))
# print smqObsTrnLst[clsIx]
assert myisEqualDct(smqObsTrnLst[clsIx], smrObsTrnLst[clsIx]), \
'diff in class: %s' % glbRspClass[clsIx]
# print 'numpy.ndarray' in type(smqObsTrnLst[9]['Dbs']['Rsp']).mro()
# print type(smqObsTrnLst[9]['Dbs']['Rsp'])
# print smqObsTrnLst[9]['Dbs']['Rsp'].shape
# print smqObsTrnLst[9]['Dbs']['Rsp']
# print type(smrObsTrnLst[9]['Dbs']['Rsp'])
# print smrObsTrnLst[9]['Dbs']['Rsp'].shape
# print smrObsTrnLst[9]['Dbs']['Rsp']
# print pd.Series(smrObsTrnRsp[9]['Dbs']['Rsp'])
# print pd.Series(smrObsTrnRsp[9]['Dbs']['Rsp']).value_counts()
tstArr = smrObsTrnLst[9]['Dbs']['Rsp']
print pd.Series(tstArr)
def mybuildDatabase(lclObsLst):
    """Concatenate per-class load() results into one flat dataset.

    Args:
        lclObsLst: list of dicts as returned by load():
            {'Cls': idClass, 'Dbs': {'Idn': ids, 'Ftr': dataset, 'Rsp': labels}}

    Returns:
        (ids list, stacked feature tensor, stacked label vector); the two
        arrays are None when lclObsLst is empty.
    """
    lclObsIdn = []
    lclObsFtr = None
    lclObsRsp = None
    for clsDct in lclObsLst:
        dbs = clsDct['Dbs']
        lclObsIdn.extend(dbs['Idn'])
        # Bug fix: the original tested `not (arr == None)`, which performs
        # an elementwise comparison on numpy arrays and raises ValueError
        # ("truth value of an array ... is ambiguous") on every iteration
        # after the first; identity (`is None`) is the correct test.
        if lclObsFtr is None:
            lclObsFtr = dbs['Ftr']
        else:
            lclObsFtr = np.vstack((lclObsFtr, dbs['Ftr']))
        if lclObsRsp is None:
            lclObsRsp = dbs['Rsp']
        else:
            lclObsRsp = np.hstack((lclObsRsp, dbs['Rsp']))
    return lclObsIdn, lclObsFtr, lclObsRsp
smrObsTrnIdn, smrObsTrnFtr, smrObsTrnRsp = mybuildDatabase(smrObsTrnLst)
print('Identifiers:', len(smrObsTrnIdn))
print('Sample dataset tensor:', smrObsTrnFtr.shape)
print('Mean:', np.mean(smrObsTrnFtr))
print('Standard deviation:', np.std(smrObsTrnFtr))
print('Labels:', smrObsTrnRsp.shape)
# print(smrObsTrnRsp[25:30])
print('Label Knts:'); print(pd.Series(smrObsTrnRsp).value_counts())
thsBgnTm = datetime.now()
glbObsTrnLst = Parallel(n_jobs = nCores, verbose = 1)(delayed(
load)(cls, trnFolders[glbRspClass.index(cls)], 2500,
maxCheck = True, verbose = False) for cls in glbRspClass)
print 'len(glbObsTrnLst): %d' % (len(glbObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'Trn Parallel load duration: %0.2f seconds' % (thsDurDff)
glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = mybuildDatabase(glbObsTrnLst)
We expect the data to be balanced across classes. Verify that.
print('Identifiers:', len(glbObsTrnIdn))
print('Full dataset tensor:', glbObsTrnFtr.shape)
print('Mean:', np.mean(glbObsTrnFtr))
print('Standard deviation:', np.std(glbObsTrnFtr))
print('Labels:', glbObsTrnRsp.shape)
print('Label Knts:'); print(pd.Series(glbObsTrnRsp).value_counts())
Let's verify that the data still looks good. Displaying a sample of the labels and images from the ndarray.
# print type(pd.Series(glbObsTrnRsp).value_counts())
# print pd.Series(glbObsTrnRsp).value_counts().sort_values(ascending = False)
# print pd.Series(glbObsTrnRsp).value_counts().sort_values(ascending = False).iloc[5]
# print pd.Series(glbObsTrnRsp).value_counts().sort_values(ascending = False).index[5]
print type(glbObsTrnRsp)
print glbObsTrnRsp.shape
print glbObsTrnRsp.shape[0]
print glbObsTrnRsp[10000:10005]
print type(range(glbObsTrnRsp.shape[0]))
print range(glbObsTrnRsp.shape[0])[10000:10005]
print (glbObsTrnRsp == 4)[10000:10005]
print type(np.array(range(glbObsTrnRsp.shape[0]))[glbObsTrnRsp == 4])
print np.array(range(glbObsTrnRsp.shape[0]))[glbObsTrnRsp == 4][10:15]
# Revised version down below (to display glbObsNew)
def mydisplayImages(obsIdn, obsFtr, obsRsp):
    """Plot nCol random sample images for every class present in obsRsp.

    Args:
        obsIdn: list of image file names, parallel to obsFtr/obsRsp.
        obsFtr: image tensor (nImages, size, size).
        obsRsp: integer class labels, parallel to obsFtr.
    """
    # One row per class (ordered by ascending frequency), 3 samples per row.
    clsSrs = pd.Series(obsRsp).value_counts().sort_values()
    nRow = clsSrs.shape[0]; nCol = 3
    figs, axes = plt.subplots(nRow, nCol,
                              figsize=(6 * nCol, 6 * nRow))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off'))
        for ax in axes.flatten()]
    for i, cls in enumerate(clsSrs.index):
        # Indices of all observations belonging to this class.
        obsClsIx = np.array(range(obsRsp.shape[0]))[obsRsp == cls]
        # print 'cls: %s' % cls
        # print 'obsClsIx[:5]: '; print obsClsIx[:5]
        for j, smpIx in enumerate(
                np.random.randint(0, len(obsClsIx), nCol)):
            if glbImg['color']:
                axes[i, j].imshow(obsFtr[obsClsIx[smpIx], :, :])
            else:
                axes[i, j].imshow(obsFtr[obsClsIx[smpIx], :, :], cmap = 'gray')
            axes[i, j].set_title(glbRspClass[cls] + ':' +
                                 obsIdn[obsClsIx[smpIx]])
    plt.show()
    # --- earlier display approach, kept commented out for reference ---
    # imgIxLst = np.random.random_integers(0, obsFtr.shape[0] - 1, 10)
    # for imgIx in imgIxLst:
    #     if (obsRsp[imgIx] > -1):
    #         print ' imgIx: %d; id: %s; label: %s' % \
    #             (imgIx, obsIdn[imgIx], glbRspClass[obsRsp[imgIx]])
    #     else:
    #         print ' imgIx: %d; id: %s; label: None' % (imgIx, obsIdn[imgIx])
    #     plt.figure
    #     plt.imshow(obsFtr[imgIx,:,:], cmap = plt.cm.gray)
    #     plt.show()
# for i, sbt in enumerate(smpSubjects):
# smpSbtDbImg = driverDf[driverDf.subject == sbt]
# for j, imgDesc in enumerate(range(smpNImg)):
# ixIdn = glbObsTrnIdn.index(smpSbtDbImg.iloc[j]['img'])
# if glbImg['color']:
# axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :])
# else:
# axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :], cmap = 'gray')
# axes[i, j].set_title(sbt + ':' + glbRspClass[glbObsTrnRsp[ixIdn]] +
# ':' + driverDf.img[j])
print 'glbObsTrn set:';
mydisplayImages(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
plt.imshow(myreadImage(trnFoldersPth + '/c8/img_26672.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_60822.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_89196.jpg'))
print driverDf.head()
print driverDf[driverDf.subject == 'p014'].head()
print driverDf[driverDf.img == 'img_44733.jpg'].head()
# smpSubjects selected to match
# https://github.com/ottogroup/statefarm/blob/master/statefarm_getting_started.ipynb
smpSubjects = []
# Updated function defined later ???
def lclDisplaySubjectSampleImages(smpSubjects, lclObsIdn, lclObsFtr, lclObsRsp):
    """For each subject (driver), plot 3 images from randomly chosen classes.

    Args:
        smpSubjects: list of subject ids (e.g. 'p002') to display.
        lclObsIdn: list of image file names, parallel to lclObsFtr/lclObsRsp.
        lclObsFtr: image tensor (nImages, size, size).
        lclObsRsp: integer class labels, parallel to lclObsFtr.

    Relies on the module-level driverDf, glbRspClass, glbRspClassN, glbImg.
    """
    smpNImg = 3
    nRow = len(smpSubjects); nCol = smpNImg
    figs, axes = plt.subplots(nRow, nCol,
                              figsize=(6 * nCol, 6 * nRow))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off'))
        for ax in axes.flatten()]
    for i, sbt in enumerate(smpSubjects):
        smpSbtDbImg = driverDf[driverDf.subject == sbt]
        # Select samples from different classes for each subject
        smpCls = [glbRspClass[clsIx]
                  for clsIx in np.random.randint(0, glbRspClassN, smpNImg)]
        # print 'smpCls: '; print smpCls
        for j, cls in enumerate(smpCls):
            smpSbtClsDbImg = smpSbtDbImg[smpSbtDbImg.classname == cls]
            # print 'sbt: %s; cls: %s; smpSbtClsDbImg.shape:' % (sbt, cls)
            # print smpSbtClsDbImg.shape
            # print smpSbtClsDbImg.columns
            # print np.random.randint(0, smpSbtClsDbImg.shape[0], 1)[0]
            # print smpSbtClsDbImg.iloc[0]['img']
            # Pick one random image of this subject+class and locate it in
            # the loaded dataset via its file name.
            ixIdn = lclObsIdn.index(smpSbtClsDbImg.iloc[
                np.random.randint(0, smpSbtClsDbImg.shape[0], 1)[0]]['img'])
            if glbImg['color']:
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :])
            else:
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :], cmap = 'gray')
            # axes[i, j].set_title(sbt + ':' + cls +
            #                      ':' + smpSbtClsDbImg.iloc[0]['img'])
            axes[i, j].set_title(sbt + ':' +
                                 glbRspClass[lclObsRsp[ixIdn]] + ':' +
                                 lclObsIdn[ixIdn])
    plt.show()
lclDisplaySubjectSampleImages(['p002', 'p012', 'p014', 'p015', 'p016'],
glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_71334.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c7/img_73378.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_79944.jpg'))
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_92682.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c5/img_93438.jpg'))
# Modified above to display sample images from different classes
# smpSubjects selected to match
# https://github.com/ottogroup/statefarm/blob/master/statefarm_getting_started.ipynb
# smpSubjects = []
# def lclDisplaySubjectSampleImages(smpSubjects):
# smpNImg = 3
# nRow = len(smpSubjects); nCol = smpNImg
# figs, axes = plt.subplots(nRow, nCol,
# figsize=(6 * nCol, 6 * nRow))
# [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
# for i, sbt in enumerate(smpSubjects):
# smpSbtDbImg = driverDf[driverDf.subject == sbt]
# for j, imgDesc in enumerate(range(smpNImg)):
# ixIdn = glbObsTrnIdn.index(smpSbtDbImg.iloc[j]['img'])
# if glbImg['color']:
# axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :])
# else:
# axes[i, j].imshow(glbObsTrnFtr[ixIdn, :, :], cmap = 'gray')
# axes[i, j].set_title(sbt + ':' + glbRspClass[glbObsTrnRsp[ixIdn]] +
# ':' + driverDf.img[j])
# plt.show()
# lclDisplaySubjectSampleImages(['p002', 'p012', 'p014', 'p015', 'p016'])
# dspLabels = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
# print 'train set:'
# imgIxLst = np.random.random_integers(0, glbObsTrnFtr.shape[0] - 1, 10)
# for imgIx in imgIxLst:
# print 'imgIx: %d: label: %s' % (imgIx, dspLabels[glbObsTrnRsp[imgIx]])
# plt.figure
# plt.imshow(glbObsTrnFtr[imgIx,:,:], cmap = plt.cm.gray)
# plt.show()
Move test images to different folders to parallelize. Change newObsTrnLst to glbObsNewLst
thsBgnTm = datetime.now()
newObsTrnLst = [load('new', newFolders[0], 80000,
maxCheck = True, verbose = True)]
# smpObsNewDct = load('new', newFolders[0], 25, maxCheck = False, verbose = False)
print 'len(newObsTrnLst): %d' % (len(newObsTrnLst))
thsDurDff = (datetime.now() - thsBgnTm).seconds
print 'newObs load duration: %0.2f seconds' % (thsDurDff)
glbObsNewLst = newObsTrnLst
glbObsNewIdn, glbObsNewFtr, glbObsNewRsp = mybuildDatabase(glbObsNewLst)
print('Identifiers:', len(glbObsNewIdn))
print('New Full dataset tensor:', glbObsNewFtr.shape)
print('Mean:', np.mean(glbObsNewFtr))
print('Standard deviation:', np.std(glbObsNewFtr))
print('Labels:', glbObsNewRsp.shape)
print('Label Knts:'); print(pd.Series(glbObsNewRsp).value_counts())
# Add ability to display multiple images if only one class ?
def mydisplayImages(obsIdn, obsFtr, obsRsp):
    """Display nCol random sample images for each class present in obsRsp.

    One grid row per class (least frequent class first); observations with a
    negative label are titled 'new' (unlabeled) instead of a class name.

    Args:
        obsIdn: list of image identifiers (filenames), parallel to obsFtr.
        obsFtr: 3-D image tensor of shape (nObs, height, width).
        obsRsp: 1-D array of integer class labels, parallel to obsFtr.
    """
    clsSrs = pd.Series(obsRsp).value_counts().sort_values()
    nRow = clsSrs.shape[0]; nCol = 3
    # squeeze=False keeps `axes` 2-D even when there is a single class,
    # which removes the need for a separate nRow == 1 code path (the
    # original duplicated the whole sampling loop for that case)
    figs, axes = plt.subplots(nRow, nCol, squeeze = False,
                              figsize = (6 * nCol, 6 * nRow))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off'))
     for ax in axes.flatten()]
    for i, cls in enumerate(clsSrs.index):
        # Indices of all observations belonging to this class
        obsClsIx = np.array(range(obsRsp.shape[0]))[obsRsp == cls]
        for j, smpIx in enumerate(
                np.random.randint(0, len(obsClsIx), nCol)):
            if glbImg['color']:
                axes[i, j].imshow(obsFtr[obsClsIx[smpIx], :, :])
            else:
                axes[i, j].imshow(obsFtr[obsClsIx[smpIx], :, :], cmap = 'gray')
            if (cls >= 0):
                axes[i, j].set_title(glbRspClass[cls] + ':' +
                                     obsIdn[obsClsIx[smpIx]])
            else:
                axes[i, j].set_title('new:' +
                                     obsIdn[obsClsIx[smpIx]])
    plt.show()
print 'glbObsNew set:';
mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
print 'glbObsNew set:';
mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
# smpNImg = 3
# smpObsNewIx = np.random.randint(0, len(glbObsNewIdn), size = smpNImg ** 2)
# smpObsNewIx = smpObsNewIx.reshape((smpNImg, smpNImg))
# # print smpObsNewIx
# nRow = smpNImg; nCol = smpNImg
# figs, axes = plt.subplots(nRow, nCol,
# figsize=(6 * nCol, 6 * nRow))
# [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off')) for ax in axes.flatten()]
# for i, smpIx in enumerate(range(smpObsNewIx.shape[0])):
# for j, smpJx in enumerate(range(smpObsNewIx.shape[1])):
# if glbImg['color']:
# axes[i, j].imshow(glbObsNewFtr[smpObsNewIx[i, j], :, :])
# else:
# axes[i, j].imshow(glbObsNewFtr[smpObsNewIx[i, j], :, :]
# , cmap = 'gray')
# axes[i, j].set_title('new:' + glbObsNewIdn[smpObsNewIx[i, j]])
# print 'New set:'; mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
# def load(data_folders, min_num_images, nImgMax):
# dataset = np.ndarray(
# shape=(nImgMax, glbImg['size'], glbImg['size']), dtype=np.float32)
# labels = np.ndarray(shape=(nImgMax), dtype=np.int32)
# label_index = 0
# image_index = 0
# for folder in data_folders:
# print(folder)
# for image in os.listdir(folder):
# if image_index >= nImgMax:
# raise Exception('More images than expected: %d >= %d' % (
# image_index, nImgMax))
# image_file = os.path.join(folder, image)
# try:
# image_data = (ndimage.imread(image_file).astype(float) -
# glbImgPixelDepth / 2) / glbImgPixelDepth
# if image_data.shape != (glbImg['size'], glbImg['size']):
# raise Exception('Unexpected image shape: %s' % str(image_data.shape))
# dataset[image_index, :, :] = image_data
# labels[image_index] = label_index
# image_index += 1
# except IOError as e:
# print('Could not read:', image_file, ':', e, '- it\'s ok, skipping.')
# label_index += 1
# num_images = image_index
# dataset = dataset[0:num_images, :, :]
# labels = labels[0:num_images]
# if num_images < min_num_images:
# raise Exception('Many fewer images than expected: %d < %d' % (
# num_images, min_num_images))
# print('Full dataset tensor:', dataset.shape)
# print('Mean:', np.mean(dataset))
# print('Standard deviation:', np.std(dataset))
# print('Labels:', labels.shape)
# return dataset, labels
# glbObsTrnFtr, glbObsTrnRsp = load(train_folders, 450000, 550000)
# glbObsNewFtr, glbObsNewRsp = load(test_folders, 18000, 20000)
#print type(glbObsTrnRsp); print glbObsTrnRsp.shape; print glbObsTrnRsp[0:10]
# print np.sum(glbObsTrnRsp == 0)
# print np.unique(glbObsTrnRsp)
# print 'train labels freqs: %s' % \
# ([np.sum(glbObsTrnRsp == thsLabel) for thsLabel in np.unique(glbObsTrnRsp)])
TODO: Decide how to handle the case where the number of data-scrub corrections exceeds 10 (threshold currently undetermined).
# Refer to glbDataScrub
Save imported data.
glbPickleFile
# Pickle the imported train/new datasets for later reuse.
# Uses a `with` block so the file handle is closed even if pickle.dump
# raises (the original open()/close() pair leaked the handle on failure).
try:
    with open(glbPickleFile['data'], 'wb') as f:
        save = {
            'glbObsTrnIdn': glbObsTrnIdn,
            'glbObsTrnFtr': glbObsTrnFtr,
            'glbObsTrnRsp': glbObsTrnRsp,
            # 'glbObsVldFtr': glbObsVldFtr,
            # 'glbObsVldRsp': glbObsVldRsp,
            'glbObsNewIdn': glbObsNewIdn,
            'glbObsNewFtr': glbObsNewFtr,
            'glbObsNewRsp': glbObsNewRsp,
        }
        # HIGHEST_PROTOCOL: binary protocol, much smaller/faster than default
        pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
    print('Unable to save data to', glbPickleFile['data'], ':', e)
    raise
statinfo = os.stat(glbPickleFile['data'])
print('Compressed Data pickle size:', statinfo.st_size)
# Reload the previously pickled (unshuffled) datasets.
# NOTE(review): the filename is hard-coded to the '_unshuffled' pickle rather
# than glbPickleFile['data'] (commented-out alternative below) — confirm this
# is the intended checkpoint to resume from.
with open('data/img_D_SFDD_ImgSz_64_unshuffled.pickle', 'rb') as f:
    # with open(glbPickleFile['data'], 'rb') as f:
    save = pickle.load(f)
    glbObsTrnIdn = save['glbObsTrnIdn']
    glbObsTrnFtr = save['glbObsTrnFtr']
    glbObsTrnRsp = save['glbObsTrnRsp']
    glbObsNewIdn = save['glbObsNewIdn']
    glbObsNewFtr = save['glbObsNewFtr']
    glbObsNewRsp = save['glbObsNewRsp']
    del save  # hint to help gc free up memory
    print('Trn set:', len(glbObsTrnIdn), glbObsTrnFtr.shape,
          glbObsTrnRsp.shape)
    print('New set:', len(glbObsNewIdn), glbObsNewFtr.shape,
          glbObsNewRsp.shape)
Next, we'll randomize the data. It's important to have the labels well shuffled for the training and test distributions to match.
np.random.seed(glbObsShuffleSeed)
def randomize(ids, dataset, labels):
    """Shuffle ids, dataset and labels with one shared random permutation.

    Args:
        ids: list of identifiers, parallel to dataset/labels.
        dataset: 3-D array (nObs, height, width).
        labels: 1-D array of class labels.
    Returns:
        (shuffled ids list, shuffled dataset, shuffled labels), all
        reordered by the same permutation so rows stay aligned.
    """
    order = np.random.permutation(labels.shape[0])
    return ([ids[jx] for jx in order],
            dataset[order, :, :],
            labels[order])
glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp = \
randomize(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
# glbObsNewIdn, glbObsNewFtr, glbObsNewRsp = \
# randomize(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
# np.random.seed(133)
# def randomize(dataset, labels):
# permutation = np.random.permutation(labels.shape[0])
# shuffled_dataset = dataset[permutation,:,:]
# shuffled_labels = labels[permutation]
# return shuffled_dataset, shuffled_labels
# glbObsTrnFtr, glbObsTrnRsp = randomize(glbObsTrnFtr, glbObsTrnRsp)
# glbObsNewFtr, glbObsNewRsp = randomize(glbObsNewFtr, glbObsNewRsp)
Check if data is still good after shuffling!
print 'shuffled Trn set:';
mydisplayImages(glbObsTrnIdn, glbObsTrnFtr, glbObsTrnRsp)
# print 'shuffled New set:';
# mydisplayImages(glbObsNewIdn, glbObsNewFtr, glbObsNewRsp)
# plt.imshow(myreadImage(trnFoldersPth + '/c8/img_25438.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_382.jpg')) # Debatable
Prune the training data as needed. Depending on your computer setup, you might not be able to fit it all in memory, and you can tune obsTrnN as needed.
Also create a validation dataset for hyperparameter tuning.
Partition using drivers as a 'group'. Check which subject's images have higher correlation with the test set images.
def mygetCorrObs(xArr, yArr,
xRowsN = None, yRowsN = None, chunkSize = None):
thsBgnTm = datetime.now()
if xRowsN == None: xRowsN = int(xArr.shape[0])
if yRowsN == None: yRowsN = int(yArr.shape[0])
if chunkSize == None: chunkSize = int(min(xRowsN, yRowsN))
else: assert (chunkSize <= xRowsN) or (chunkSize <= yRowsN), \
'invalid chunkSize: %d, should be None' + \
' or <= %5d (xRowsN) or <= %5d (yRowsN)' % \
(chunkSize, xRowsN, yRowsN)
# assert chunkSize < 10000, \
# 'chunkSize: %d too large; terminating ...' % (chunkSize)
xMtx = np.reshape(xArr[:xRowsN],
(xRowsN, xArr.shape[1] * xArr.shape[2]))
yMtx = np.reshape(yArr[:yRowsN],
(yRowsN, yArr.shape[1] * yArr.shape[2]))
corMtx = np.zeros((xRowsN, yRowsN))
corMtx[:,:] = np.nan
for rowIx in xrange(0, int(xRowsN), int(chunkSize)):
if (rowIx + chunkSize > xRowsN): break
if ((datetime.now() - thsBgnTm).seconds > 60) and \
mydspVerboseTrigger(rowIx):
print ' (at %5d secs) chunkSize: %5d; rowIx: %5d' % \
((datetime.now() - thsBgnTm).seconds, chunkSize, rowIx)
for colIx in xrange(0, int(yRowsN), int(chunkSize)):
if (colIx + chunkSize > yRowsN): break
corMtx[(rowIx):(rowIx + chunkSize),
(colIx):(colIx + chunkSize)] = \
np.corrcoef(xMtx[(rowIx):(rowIx + chunkSize), :],
yMtx[(colIx):(colIx + chunkSize), :])[:chunkSize,
chunkSize:]
if (chunkSize == 1): pass
else:
# x Boundary condition
dffXIx = xRowsN % chunkSize
dffYIx = yRowsN % chunkSize
if (dffXIx > 0):
if ((datetime.now() - thsBgnTm).seconds > 60):
print ' (at %5d secs) chunkSize: %5d; dffXIx: %5d' % \
((datetime.now() - thsBgnTm).seconds, chunkSize, dffXIx)
assert dffXIx < 10000, \
'dffXIx: %d too large; terminating ...' % (dffXIx)
for colIx in xrange(0, int(yRowsN), int(chunkSize)):
if (colIx + chunkSize > yRowsN): break
# print ' dffXIx: xArr Rows: (%5d:%5d); yArr Rows: (%5d:%5d)' % \
# ((xRowsN - dffXIx), (xRowsN), (colIx), (colIx + chunkSize))
corMtx[(xRowsN - dffXIx):(xRowsN),
(colIx):(colIx + chunkSize)] = \
np.corrcoef(xMtx[(xRowsN - dffXIx):(xRowsN), :],
yMtx[(colIx):(colIx + chunkSize), :])[
:dffXIx, dffXIx:]
# y Boundary condition
if (dffYIx > 0):
# assert True, 'mygetCorrObs: not implemented yet for dffYIx > 0'
if ((datetime.now() - thsBgnTm).seconds > 60):
print ' (at %5d secs) chunkSize: %5d; dffYIx: %5d' % \
((datetime.now() - thsBgnTm).seconds, chunkSize, dffYIx)
assert dffYIx < 10000, \
'dffYIx: %d too large; terminating ...' % (dffYIx)
for rowIx in xrange(0, int(xRowsN), int(chunkSize)):
if (rowIx + chunkSize > xRowsN): break
# print ' dffYIx: xArr Rows: (%5d:%5d); yArr Rows: (%5d:%5d)' % \
# ((rowIx), (rowIx + chunkSize), (yRowsN - dffYIx), (yRowsN))
corMtx[(rowIx):(rowIx + chunkSize),
(yRowsN - dffYIx):(yRowsN)] = \
np.corrcoef(xMtx[(rowIx):(rowIx + chunkSize), :],
yMtx[(yRowsN - dffYIx):(yRowsN), :])[
:chunkSize, chunkSize:]
# x & y Boundary condition
if (dffXIx > 0) or (dffYIx > 0):
if ((datetime.now() - thsBgnTm).seconds > 60):
print ' (at %5d secs) chunkSize: 1; xRowsN - dffXIx: %5d; yRowsN - dffYIx: %5d' % \
((datetime.now() - thsBgnTm).seconds, xRowsN - dffXIx, yRowsN - dffYIx)
# assert dffXIx * dffYIx < 10000, \
# 'dffXIx*YIx: %d too large; dffXIx: %d; dffYIx: %d; terminating...' % \
# (dffXIx * dffYIx, dffXIx, dffYIx)
for rowIx in xrange(int(xRowsN - dffXIx), int(xRowsN)):
thsDrn = (datetime.now() - thsBgnTm).seconds
if (thsDrn > 60) and \
mydspVerboseTrigger(rowIx):
print ' (at %d secs) chunkSize: 1; rowIx: %5d' % (thsDrn, rowIx)
for colIx in xrange(int(yRowsN - dffYIx), int(yRowsN)):
corMtx[rowIx:(rowIx + 1), colIx:(colIx + 1)] = \
np.corrcoef(xMtx[rowIx:(rowIx + 1), :],
yMtx[colIx:(colIx + 1), :])[:1, 1:]
assert (corMtx[:,:] != np.nan).all(), 'some cells in corMtx == nan'
return pd.DataFrame({'mean' : np.nanmean(corMtx),
'median': np.nanmedian(corMtx),
'min' : np.nanmin(corMtx),
'max' : np.nanmax(corMtx),
'xRowsN' : xRowsN,
'yRowsN' : yRowsN,
'duration': (datetime.now() - thsBgnTm).seconds,
'chunkSize': chunkSize
}, index = [0])
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7], chunkSize = 1)
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7], chunkSize = 3)
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7], chunkSize = 9)
# print mygetCorrObs(glbObsTrnFtr[:11], glbObsNewFtr[:7])
print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7], chunkSize = 1)
print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7], chunkSize = 3)
print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7], chunkSize = 9)
print mygetCorrObs(glbObsFitFtr[:11], glbObsNewFtr[:7])
# NaN correlation
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[21164:(21164+10)])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[10000:50000])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[10000:
# (10000+50000) / 4])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
# (10000+50000) / 4:
# (10000+50000) / 2])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
# (15000+ 0) / 1:
# (15000+30000) / 2])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
# (20625+ 0) / 1:
# (20625+22500) / 2])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
# (21093+ 0) / 1:
# (21093+21210) / 2])
print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[
(21164+ 0) / 1:
( 0+21165) / 1])
print (glbObsNewFtr[21164] == -0.5).all()
tstGetCorrObsDf = None
# tstGetCorrObsDf = pd.DataFrame()
# tstGetCorrObsDf = tstGetCorrObsDf[tstGetCorrObsDf['mean'].notnull()]
%run img_utils.py
srchParamsDct = {
'chunkSize' : [300, 500, 700],
# 'chunkSize' : [1, 5, 10, 100, 200, 1000],
'xRowsN' : [346, 791, 1237],
# 'xRowsN' : [10, 100, 200, 346],
# 346 is min; 1237 is max by subject
'yRowsN' : [79726],
# 'yRowsN' : [100, 1000, 10000, 20000, 50000, 79726],
# 79726 is len(glbObsNewIdn)
}
jnk = mysearchParams(mygetCorrObs, srchParamsDct = srchParamsDct,
curResultsDf = tstGetCorrObsDf,
mode = 'displayonly',
save_filepathname = \
'data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.pickle',
xArr = glbObsTrnFtr,
yArr = glbObsNewFtr)
tstGetCorrObsDf = mysearchParams(mygetCorrObs,
srchParamsDct = srchParamsDct,
curResultsDf = tstGetCorrObsDf,
mode = 'run',
sort_values = ['yRowsN', 'xRowsN', 'duration'],
sort_ascending = [False , False , True ],
save_filepathname = \
'data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.pickle',
xArr = glbObsTrnFtr,
yArr = glbObsNewFtr)
# print tstGetCorrObsDf
tstGetCorrObsDf['bestFit'] = False
tstGetCorrObsDf.ix[(500.0, 79726.0, 346.0), 'bestFit'] = True
# 1000.0 79726.0 346.0
print tstGetCorrObsDf[tstGetCorrObsDf.yRowsN >= 70000]
robjects.pandas2ri.activate()
pltRDf = robjects.conversion.py2ri(tstGetCorrObsDf[tstGetCorrObsDf.xRowsN >= 346])
# print(pltRDf)
pltRFn = robjects.r("""
source('~/Dropbox/datascience/R/myplot.R')
function(RDf, filename) {
mypltModelStats(RDf, c('mean', 'median', 'duration'),
dim = c('chunkSize', 'yRowsN','xRowsN'),
scaleXFn = NULL,
#highLightIx = which.min(RDf$logLossVld),
highLightIx = which(RDf$bestFit == 'TRUE'),
title = NULL,
fileName = filename)
}
""")
pltRFn(pltRDf, 'img_01_import_data_SFDD_tstGetCorrObsDf.png')
pltRFn = robjects.r("""
source('~/Dropbox/datascience/R/myplot.R')
function(RDf, filename) {
mypltModelStats(RDf, c('mean', 'median'),
dim = c('chunkSize', 'yRowsN','xRowsN'),
scaleXFn = NULL,
#highLightIx = which.min(RDf$logLossVld),
highLightIx = which(RDf$bestFit == 'TRUE'),
title = NULL,
fileName = filename)
}
""")
pltRFn(pltRDf, 'img_01_import_data_SFDD_tstGetCorrObsDf_nodur.png')
tstGetCorrObsDf.to_csv('data/img_01_import_data_SFDD_ImgSz_64_tstGetCorrObsDf.csv')
# def mygetCorrObs(xArr, yArr):
# xRowsN = xArr.shape[0]
# yRowsN = yArr.shape[0]
# corMtx = np.corrcoef(np.reshape(xArr,
# (xArr.shape[0], xArr.shape[1] * xArr.shape[2])),
# np.reshape(yArr,
# (yArr.shape[0], yArr.shape[1] * yArr.shape[2])))
# # print corMtx.shape
# corMtx = corMtx[:xRowsN, xRowsN:]
# # print corMtx
# # print 'corMtx: min: %.4f; max: %.4f; avg: %.4f;' % \
# # (np.min(corMtx), np.max(corMtx), np.mean(corMtx))
# return pd.DataFrame({'mean' : np.mean(corMtx),
# 'median': np.median(corMtx),
# 'min' : np.min(corMtx),
# 'max' : np.max(corMtx),
# 'x.n' : np.shape(xArr)[0],
# 'y.n' : np.shape(yArr)[0],
# }, index = [0])
# print mygetCorrObs(glbObsTrnFtr[:5], glbObsNewFtr[:10])
# drvTrnSbt = driverDf.subject.unique()
drvSbtDf = (pd.DataFrame({'Trn.Images.N': driverDf['subject'].value_counts()})
.sort_values('Trn.Images.N'))
print drvSbtDf
# srhSqnNewCorDf = smpSqnNewCorDf
# srhSqnNewCorDf = pd.DataFrame()
def getSbtNewObsCorrelation(sbt, lclObsTrnIdn, lclObsTrnFtr,
verbose = False):
thsBgnTm = datetime.now()
if verbose:
print 'getSbtNewObsCorrelation: sbt: %s' % (sbt)
tmpSet = set(driverDf[driverDf.subject.isin([sbt])]['img'])
sbtObsTrnIdx = [ix for ix in xrange(len(lclObsTrnIdn))
if lclObsTrnIdn[ix] in
tmpSet]
sbtObsTrnFtr = lclObsTrnFtr[sbtObsTrnIdx,:,:]
# if verbose:
# print ' sbtObsTrnFtr duration: %0.2f seconds' % \
# ((datetime.now() - thsBgnTm).seconds)
thsBgnTm = datetime.now()
corDf = mygetCorrObs(sbtObsTrnFtr, glbObsNewFtr, chunkSize = 500)
corDf['subject'] = sbt
if verbose:
print ' corDf duration: %0.2f seconds' % \
((datetime.now() - thsBgnTm).seconds)
return corDf
thsBgnTm = datetime.now()
retLst = []
for sbt in drvSbtDf.index[:3]:
retLst.append(getSbtNewObsCorrelation(sbt,
glbObsTrnIdn, glbObsTrnFtr,
verbose = True))
smpSqnNewCorDf = pd.DataFrame()
for df in retLst:
smpSqnNewCorDf = smpSqnNewCorDf.append(df)
print 'smp getSbtNewObsCorrelation sequential duration: %0.2f seconds' % \
((datetime.now() - thsBgnTm).seconds)
print smpSqnNewCorDf
NOTE: The parallel version of this test does not work — it crashes consistently (possibly because joblib must serialize the large training arrays for each worker process).
thsBgnTm = datetime.now()
retLst = Parallel(n_jobs = nCores, verbose = 2)(delayed(
getSbtNewObsCorrelation)(sbt, glbObsTrnIdn, glbObsTrnFtr)
for sbt in drvSbtDf.index[:3])
# print retLst
smpPrlNewCorDf = pd.DataFrame()
for df in retLst:
# print 'type(df): %s' % (str(type(df)))
# print 'type(drvNewCorDf): %s' % (str(type(drvNewCorDf)))
smpPrlNewCorDf = smpPrlNewCorDf.append(df)
print 'smp getSbtNewObsCorrelation parallel duration: %0.2f seconds' % \
((datetime.now() - thsBgnTm).seconds)
print smpPrlNewCorDf
assert smpSqnNewCorDf.equals(smpPrlNewCorDf), \
'smpSqnNewCorDf != smpPrlNewCorDf'
thsBgnTm = datetime.now()
retLst = Parallel(n_jobs = nCores, verbose = 2)(delayed(
getSbtNewObsCorrelation)(sbt,
glbObsTrnIdn[:5000],
glbObsTrnFtr[:5000])
for sbt in drvTrnSbt[:3])
# print retLst
smpPrlNewCorDf = pd.DataFrame()
for df in retLst:
# print 'type(df): %s' % (str(type(df)))
# print 'type(drvNewCorDf): %s' % (str(type(drvNewCorDf)))
smpPrlNewCorDf = smpPrlNewCorDf.append(df)
print 'getSbtNewObsCorrelation parallel duration: %0.2f seconds' % \
((datetime.now() - thsBgnTm).seconds)
print smpPrlNewCorDf
thsBgnTm = datetime.now()
retLst = []
for sbt in drvTrnSbt[:3]:
retLst.append(getSbtNewObsCorrelation(sbt,
glbObsTrnIdn[:5000], glbObsTrnFtr[:5000]))
# print retLst
smpSqnNewCorDf = pd.DataFrame()
for df in retLst:
smpSqnNewCorDf = smpSqnNewCorDf.append(df)
print 'getSbtNewObsCorrelation sequential duration: %0.2f seconds' % \
((datetime.now() - thsBgnTm).seconds)
print smpPrlNewCorDf
assert smpSqnNewCorDf.equals(smpPrlNewCorDf), \
'smpSqnNewCorDf != smpPrlNewCorDf'
thsBgnTm = datetime.now()
retLst = Parallel(n_jobs = nCores, verbose = 2)(delayed(
getSbtNewObsCorrelation)(sbt,
glbObsTrnIdn,
glbObsTrnFtr)
for sbt in drvTrnSbt)
drvNewCorDf = pd.DataFrame()
for df in retLst:
drvNewCorDf = drvNewCorDf.append(df)
print 'getSbtNewObsCorrelation parallel duration: %0.2f seconds' % \
((datetime.now() - thsBgnTm).seconds)
print drvNewCorDf
thsBgnTm = datetime.now()
retLst = []
for sbt in drvSbtDf.index:
retLst.append(getSbtNewObsCorrelation(sbt,
glbObsTrnIdn, glbObsTrnFtr,
verbose = True))
sbtNewCorDf = pd.DataFrame()
for df in retLst:
sbtNewCorDf = sbtNewCorDf.append(df)
print 'sbt getSbtNewObsCorrelation sequential duration: %0.2f seconds' % \
((datetime.now() - thsBgnTm).seconds)
print sbtNewCorDf.sort_values('mean', ascending = False)
try:
f = open(glbPickleFile['data'], 'wb')
save = {
'glbObsTrnIdn': glbObsTrnIdn,
'glbObsTrnFtr': glbObsTrnFtr,
'glbObsTrnRsp': glbObsTrnRsp,
# 'glbObsVldFtr': glbObsVldFtr,
# 'glbObsVldRsp': glbObsVldRsp,
'glbObsNewIdn': glbObsNewIdn,
'glbObsNewFtr': glbObsNewFtr,
'glbObsNewRsp': glbObsNewRsp,
'sbtNewCorDf' : sbtNewCorDf
}
pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
f.close()
except Exception as e:
print('Unable to save data to', glbPickleFile['data'], ':', e)
raise
statinfo = os.stat(glbPickleFile['data'])
print('Compressed Data pickle size:', statinfo.st_size)
import ggplot
gp = ggplot.ggplot(ggplot.components.aes(x = 'median', y = 'mean'), data = sbtNewCorDf) + \
ggplot.geom_point(ggplot.components.aes(size = 'xRowsN'), color = 'blue') + \
ggplot.geom_text(ggplot.components.aes(label = 'subject'))
print gp
sbtNewCorDf = sbtNewCorDf.sort_values('mean', ascending = False)
sbtNewCorDf['xRowsN.cum'] = sbtNewCorDf['xRowsN'].cumsum()
sbtNewCorDf['xRowsN.cum.nTrn.Ratio'] = sbtNewCorDf['xRowsN.cum'] * 1.0 / \
sbtNewCorDf['xRowsN'].sum()
print sbtNewCorDf
drvVldSbt = sbtNewCorDf[sbtNewCorDf['xRowsN.cum.nTrn.Ratio'] < 0.2]
print '\ndrvVldSbt:'; print drvVldSbt
# print ~sbtNewCorDf.subject.isin(drvVldSbt['subject'])
drvFitSbt = sbtNewCorDf[~sbtNewCorDf.subject.isin(drvVldSbt['subject'])]
print '\ndrvFitSbt:'; print drvFitSbt
assert len(set(drvVldSbt['subject']).intersection(set(drvFitSbt['subject']))) == 0, \
'drvVldSbt has elements in drvFitSbt'
from sklearn.cross_validation import train_test_split
drvTrnSbt = driverDf.subject.unique()
drvTrnN = drvTrnSbt.shape[0]
drvVldN = int(drvTrnN * 0.2)
drvFitSbt, drvVldSbt = train_test_split(drvTrnSbt,
test_size = drvVldN,
random_state = glbObsShuffleSeed)
drvFitSbt.sort()
drvVldSbt.sort()
print 'Vld subjects obtained: %5d; vs. desired: %5d' % \
(drvVldSbt.shape[0], drvVldN)
print type(drvVldSbt)
print drvVldSbt
print 'Fit subjects obtained: %5d' % \
(drvFitSbt.shape[0])
print driverDf.subject[1000:1005]
# print (driverDf.subject.isin(drvVldSbt)).shape
print driverDf.subject.isin(drvVldSbt)[1000:1005]
# print driverDf[driverDf.subject.isin(drvVldSbt)]['img'][0:5]
print driverDf[driverDf.subject.isin(drvVldSbt)][1000:1005]
print driverDf[driverDf.subject.isin(drvVldSbt)]['img'][1000:1005]
print [glbObsTrnIdn[ix] for ix in xrange(len(glbObsTrnIdn))
if glbObsTrnIdn[ix] in set(driverDf[driverDf.subject.isin(drvVldSbt)]['img'])
][0:5]
# obsTrnN = glbObsTrnFtr.shape[0] # or fixed number e.g. 20000
# obsVldN = int(obsTrnN * 0.2)
# print 'obsTrnN: %d; obsVldN: %d' % (obsTrnN, obsVldN)
tmpVldSbtImgSet = set(driverDf[driverDf.subject.isin(drvVldSbt.subject)]['img'])
# print tmpVldSbtImgSet
tmpObsVldIdx = [ix for ix in xrange(len(glbObsTrnIdn))
if glbObsTrnIdn[ix] in tmpVldSbtImgSet]
glbObsVldIdn = [glbObsTrnIdn[ix] for ix in tmpObsVldIdx]
glbObsVldFtr = glbObsTrnFtr[tmpObsVldIdx,:,:]
glbObsVldRsp = glbObsTrnRsp[tmpObsVldIdx]
# glbObsFitIdn = glbObsTrnIdn[obsVldN:obsVldN+obsTrnN]
# glbObsFitFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
# glbObsFitRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]
# print(' Fitting:', len(glbObsFitIdn), glbObsFitFtr.shape, glbObsFitRsp.shape)
print('Validation:', len(glbObsVldIdn), glbObsVldFtr.shape, glbObsVldRsp.shape)
print 'Validation / Trn Obs: %.4f' % (len(glbObsVldIdn) * 1.0 / len(glbObsTrnIdn))
tmpFitSbtImgSet = set(driverDf[driverDf.subject.isin(drvFitSbt.subject)]['img'])
# print tmpVldSbtImgSet
tmpObsFitIdx = [ix for ix in xrange(len(glbObsTrnIdn))
if glbObsTrnIdn[ix] in tmpFitSbtImgSet]
glbObsFitIdn = [glbObsTrnIdn[ix] for ix in tmpObsFitIdx]
glbObsFitFtr = glbObsTrnFtr[tmpObsFitIdx,:,:]
glbObsFitRsp = glbObsTrnRsp[tmpObsFitIdx]
# glbObsFitIdn = glbObsTrnIdn[obsVldN:obsVldN+obsTrnN]
# glbObsFitFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
# glbObsFitRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]
print(' Fitting:', len(glbObsFitIdn), glbObsFitFtr.shape, glbObsFitRsp.shape)
print 'Fit / Trn Obs: %.4f' % (len(glbObsFitIdn) * 1.0 / len(glbObsTrnIdn))
# obsTrnN = glbObsTrnFtr.shape[0]
# #obsTrnN = 200000
# obsVldN = 10000
# glbObsVldFtr = glbObsTrnFtr[:obsVldN,:,:]
# glbObsVldRsp = glbObsTrnRsp[:obsVldN]
# glbObsTrnFtr = glbObsTrnFtr[obsVldN:obsVldN+obsTrnN,:,:]
# glbObsTrnRsp = glbObsTrnRsp[obsVldN:obsVldN+obsTrnN]
# print('Training', glbObsTrnFtr.shape, glbObsTrnRsp.shape)
# print('Validation', glbObsVldFtr.shape, glbObsVldRsp.shape)
print 'glbObsVldRsp class knts & Trn ratios: '
print (np.unique(glbObsVldRsp, return_counts = True))
print (np.unique(glbObsVldRsp, return_counts = True)[1] * 1.0 /
np.unique(glbObsTrnRsp, return_counts = True)[1])
try:
f = open(glbPickleFile['data'], 'wb')
save = {
# 'glbObsTrnIdn': glbObsTrnIdn,
# 'glbObsTrnFtr': glbObsTrnFtr,
# 'glbObsTrnRsp': glbObsTrnRsp,
'glbObsFitIdn': glbObsFitIdn,
'glbObsFitFtr': glbObsFitFtr,
'glbObsFitRsp': glbObsFitRsp,
'glbObsVldIdn': glbObsVldIdn,
'glbObsVldFtr': glbObsVldFtr,
'glbObsVldRsp': glbObsVldRsp,
'glbObsNewIdn': glbObsNewIdn,
'glbObsNewFtr': glbObsNewFtr,
'glbObsNewRsp': glbObsNewRsp,
'sbtNewCorDf' : sbtNewCorDf
}
pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
f.close()
except Exception as e:
print('Unable to save data to', glbPickleFile['data'], ':', e)
raise
statinfo = os.stat(glbPickleFile['data'])
print('Compressed Data pickle size:', statinfo.st_size)
Check images in validation set to ensure that they are from the appropriate subjects
def lclDisplaySubjectSampleImages(smpSubjects, lclObsIdn, lclObsFtr, lclObsRsp):
    """Display a sample-image grid (one row per subject) to verify that a
    partition contains only images from the expected subjects.

    For each subject, samples smpNImg random classes and shows one random
    image of that subject+class, titled subject:class:filename so any
    subject/label mismatch is easy to spot.

    Args:
        smpSubjects: iterable of subject ids (e.g. 'p002').
        lclObsIdn: list of image filenames, parallel to lclObsFtr.
        lclObsFtr: 3-D image tensor (nObs, height, width).
        lclObsRsp: 1-D array of class labels, parallel to lclObsFtr.
    """
    # print smpSubjects
    smpNImg = 3
    nRow = len(smpSubjects); nCol = smpNImg
    # Bug fix: squeeze=False keeps `axes` 2-D even when only one subject is
    # passed; previously axes[i, j] raised IndexError for nRow == 1
    figs, axes = plt.subplots(nRow, nCol, squeeze = False,
                              figsize = (6 * nCol, 6 * nRow))
    [(ax.set_xticks([]), ax.set_yticks([]), ax.axis('off'))
     for ax in axes.flatten()]
    for i, sbt in enumerate(smpSubjects):
        smpSbtDbImg = driverDf[driverDf.subject == sbt]
        # Select samples from different classes for each subject
        smpCls = [glbRspClass[clsIx]
                  for clsIx in np.random.randint(0, glbRspClassN, smpNImg)]
        for j, cls in enumerate(smpCls):
            smpSbtClsDbImg = smpSbtDbImg[smpSbtDbImg.classname == cls]
            # Pick one random image of this subject+class and find its row
            # in the partition being checked
            ixIdn = lclObsIdn.index(smpSbtClsDbImg.iloc[
                np.random.randint(0, smpSbtClsDbImg.shape[0], 1)[0]]['img'])
            if glbImg['color']:
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :])
            else:
                axes[i, j].imshow(lclObsFtr[ixIdn, :, :], cmap = 'gray')
            # Title from the partition's own label so mislabels are visible
            axes[i, j].set_title(sbt + ':' +
                                 glbRspClass[lclObsRsp[ixIdn]] + ':' +
                                 lclObsIdn[ixIdn])
    plt.show()
lclDisplaySubjectSampleImages(drvVldSbt['subject'].unique(),
glbObsVldIdn, glbObsVldFtr, glbObsVldRsp)
plt.imshow(myreadImage(trnFoldersPth + '/c5/img_78504.jpg'))
lclDisplaySubjectSampleImages(drvFitSbt['subject'].unique(),
glbObsFitIdn, glbObsFitFtr, glbObsFitRsp)
plt.imshow(myreadImage(trnFoldersPth + '/c9/img_16428.jpg'))
# plt.imshow(myreadImage(trnFoldersPth + '/c9/img_71047.jpg')) debatable
By construction, this dataset might contain a lot of overlapping samples, including training data that's also contained in the validation and test set! Overlap between training and test can skew the results if you expect to use your model in an environment where there is never an overlap, but are actually ok if you expect to see training samples recur when you use it. Measure how much overlap there is between training, validation and test samples.
Optional questions:
# obsFitSet = set(img.tostring() for img in glbObsFitFtr)
# print 'Fit: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsFitFtr.shape, len(obsFitSet), \
# (glbObsFitFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)
obsTrnSet = set(img.tostring() for img in glbObsTrnFtr)
print 'Trn: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
(glbObsTrnFtr.shape, len(obsTrnSet), \
(glbObsTrnFtr.shape[0] * 1.0 / len(obsTrnSet) - 1) * 100)
obsFitSet = set(img.tostring() for img in glbObsFitFtr)
print 'Fit: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
(glbObsFitFtr.shape, len(obsFitSet), \
(glbObsFitFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)
obsVldSet = set(img.tostring() for img in glbObsVldFtr)
print 'Vld: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
(glbObsVldFtr.shape, len(obsVldSet), \
(glbObsVldFtr.shape[0] * 1.0 / len(obsVldSet) - 1) * 100)
obsNewSet = set(img.tostring() for img in glbObsNewFtr)
print 'New: shape: %s vs. len(set): %5d pctDups: %0.4f' % \
(glbObsNewFtr.shape, len(obsNewSet), \
(glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100)
# print np.unique(glbObsNewFtr[:5], return_inverse = True)[1].shape
# print np.vstack({tuple(row) for row in glbObsNewFtr[:5]})
tmpObsNewFtr = glbObsNewFtr
# print 'tmpObsNewFtr.shape: %s' % (str(tmpObsNewFtr.shape))
rshObsNewFtr = np.reshape(tmpObsNewFtr,
(tmpObsNewFtr.shape[0],
tmpObsNewFtr.shape[1] * tmpObsNewFtr.shape[2]))
# print 'rshObsNewFtr.shape: %s' % (str(rshObsNewFtr.shape))
# print np.ascontiguousarray(tmpObsNewFtr).shape
conObsNewFtr = np.ascontiguousarray(rshObsNewFtr).view(np.dtype((np.void,
rshObsNewFtr.dtype.itemsize * rshObsNewFtr.shape[1])))
# print conObsNewFtr.shape
# # print conObsNewFtr # This prints gibberish
_, idx = np.unique(conObsNewFtr, return_inverse=True)
# print idx.shape
# print idx
frqObsNewFtr = pd.Series(idx).value_counts()
# print frqObsNewFtr
# print type(frqObsNewFtr[frqObsNewFtr > 1])
print frqObsNewFtr[frqObsNewFtr > 1]
dupsIx = np.where(idx == 4)
print dupsIx
for obsIx in dupsIx[0]:
print obsIx
plt.imshow(myreadImage(newFoldersPth + '/' + glbObsNewIdn[obsIx]))
plt.title('new:' + glbObsNewIdn[obsIx])
plt.show()
print 'Vld set overlap with Fit set: %0.4f' % \
(len(obsVldSet.intersection(obsFitSet)) * 1.0 / len(obsVldSet))
print 'Vld set overlap with New set: %0.4f' % \
(len(obsVldSet.intersection(obsNewSet)) * 1.0 / len(obsNewSet))
print 'Fit set overlap with New set: %0.4f' % \
(len(obsFitSet.intersection(obsNewSet)) * 1.0 / len(obsFitSet))
print mygetCorrObs(glbObsVldFtr, glbObsFitFtr, chunkSize = 1000)
print mygetCorrObs(glbObsVldFtr, glbObsNewFtr, chunkSize = 1000)
print sbtNewCorDf[:5]['mean']
print sbtNewCorDf[:5]['xRowsN']
print np.average(sbtNewCorDf[:5]['mean'],
weights = sbtNewCorDf[:5]['xRowsN'])
print mygetCorrObs(glbObsFitFtr[:1000], glbObsNewFtr, chunkSize = 500)
print mygetCorrObs(glbObsFitFtr[:1000], glbObsNewFtr, chunkSize = 5000)
print mygetCorrObs(glbObsFitFtr[:1000], glbObsNewFtr, chunkSize = 1000)
print mygetCorrObs(glbObsFitFtr[:2000], glbObsNewFtr, chunkSize = 1000)
print mygetCorrObs(glbObsFitFtr[:2000], glbObsNewFtr, chunkSize = 3000)
print mygetCorrObs(glbObsFitFtr[:2000], glbObsNewFtr, chunkSize = 2000)
print mygetCorrObs(glbObsFitFtr[:5000], glbObsNewFtr, chunkSize = 2000)
print mygetCorrObs(glbObsFitFtr[:5000], glbObsNewFtr, chunkSize = 500)
print mygetCorrObs(glbObsFitFtr[:5000], glbObsNewFtr, chunkSize = 1000)
print mygetCorrObs(glbObsFitFtr[:10000], glbObsNewFtr, chunkSize = 1000)
# print mygetCorrObs(glbObsFitFtr, glbObsNewFtr, chunkSize = 1000)
with open(glbPickleFile['data'], 'rb') as f:
save = pickle.load(f)
# glbObsTrnIdn = save['glbObsTrnIdn']
# glbObsTrnFtr = save['glbObsTrnFtr']
# glbObsTrnRsp = save['glbObsTrnRsp']
glbObsFitIdn = save['glbObsFitIdn']
glbObsFitFtr = save['glbObsFitFtr']
glbObsFitRsp = save['glbObsFitRsp']
glbObsVldIdn = save['glbObsVldIdn']
glbObsVldFtr = save['glbObsVldFtr']
glbObsVldRsp = save['glbObsVldRsp']
glbObsNewIdn = save['glbObsNewIdn']
glbObsNewFtr = save['glbObsNewFtr']
glbObsNewRsp = save['glbObsNewRsp']
sbtNewCorDf = save['sbtNewCorDf']
del save # hint to help gc free up memory
print('Fit set:', len(glbObsFitIdn), glbObsFitFtr.shape,
glbObsFitRsp.shape)
print('Vld set:', len(glbObsVldIdn), glbObsVldFtr.shape,
glbObsVldRsp.shape)
print('New set:', len(glbObsNewIdn), glbObsNewFtr.shape,
glbObsNewRsp.shape)
print '\nsbtNewCorDf:'; print (sbtNewCorDf.head())
print mygetCorrObs(glbObsFitFtr[
np.random.permutation(glbObsFitFtr.shape[0])[
np.random.randint(0, glbObsFitFtr.shape[0], 100)
]],
glbObsNewFtr, chunkSize = 1000)
print mygetCorrObs(glbObsFitFtr[
np.random.permutation(glbObsFitFtr.shape[0])[
np.random.randint(0, glbObsFitFtr.shape[0], 1000)
]],
glbObsNewFtr, chunkSize = 1000)
print mygetCorrObs(glbObsFitFtr[
np.random.permutation(glbObsFitFtr.shape[0])[:10000]],
glbObsNewFtr, chunkSize = 1000)
print mygetCorrObs(glbObsFitFtr,
glbObsNewFtr, chunkSize = 1000)
Finally, let's save the data for later reuse:
Remember to save previous pickled file as '_unshuffled'
# glbPickleFile = os.getcwd() + '/data/notMNIST.pickle'
# print glbPickleFile
# try:
# f = open('data/' + glbPickleFile, 'wb')
# save = {
# 'glbObsTrnIdn': glbObsTrnIdn,
# 'glbObsTrnFtr': glbObsTrnFtr,
# 'glbObsTrnRsp': glbObsTrnRsp,
# 'glbObsFitIdn': glbObsFitIdn,
# 'glbObsFitFtr': glbObsFitFtr,
# 'glbObsFitRsp': glbObsFitRsp,
# 'glbObsVldIdn': glbObsVldIdn,
# 'glbObsVldFtr': glbObsVldFtr,
# 'glbObsVldRsp': glbObsVldRsp,
# 'glbObsNewIdn': glbObsNewIdn,
# 'glbObsNewFtr': glbObsNewFtr,
# 'glbObsNewRsp': glbObsNewRsp,
# 'sbtNewCorDf' : sbtNewCorDf
# }
# pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
# f.close()
# except Exception as e:
# print('Unable to save data to', glbPickleFile, ':', e)
# raise
# statinfo = os.stat('data/' + glbPickleFile)
# print('Compressed pickle size:', statinfo.st_size)
# print glbObsTrnFtr[0:3]
# print np.ascontiguousarray(glbObsTrnFtr[0:3])
# print np.ascontiguousarray(glbObsTrnFtr[0:3]).shape
# obsFitSet = set(img.tostring() for img in glbObsFitFtr)
# print 'Fit: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsFitFtr.shape, len(obsFitSet), \
# (glbObsFitFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)
# obsVldSet = set(img.tostring() for img in glbObsVldFtr)
# print 'Vld: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsVldFtr.shape, len(obsVldSet), \
# (glbObsVldFtr.shape[0] * 1.0 / len(obsVldSet) - 1) * 100)
# obsNewSet = set(img.tostring() for img in glbObsNewFtr)
# print 'New: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsNewFtr.shape, len(obsNewSet), \
# (glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100)
#print glbObsTrnFtr[0:3]
# obsFitSet = set(img.tostring() for img in glbObsTrnFtr)
# print 'train: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsTrnFtr.shape, len(obsFitSet), \
# (glbObsTrnFtr.shape[0] * 1.0 / len(obsFitSet) - 1) * 100)
# validSet = set(img.tostring() for img in glbObsVldFtr)
# print 'valid: shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsVldFtr.shape, len(validSet), \
# (glbObsVldFtr.shape[0] * 1.0 / len(validSet) - 1) * 100)
# obsNewSet = set(img.tostring() for img in glbObsNewFtr)
# print 'test : shape: %s vs. len(set): %d pctDups: %0.4f' % \
# (glbObsNewFtr.shape, len(obsNewSet), \
# (glbObsNewFtr.shape[0] * 1.0 / len(obsNewSet) - 1) * 100)
# print 'Vld set overlap with Fit set: %0.4f' % \
# (len(obsVldSet.intersection(obsFitSet)) * 1.0 / len(obsVldSet))
# print 'Vld set overlap with New set: %0.4f' % \
# (len(obsVldSet.intersection(obsNewSet)) * 1.0 / len(obsNewSet))
# print 'Fit set overlap with New set: %0.4f' % \
# (len(obsFitSet.intersection(obsNewSet)) * 1.0 / len(obsFitSet))
# print ' test set overlap with train set: %0.4f' % \
# (len( obsNewSet.intersection(obsFitSet)) * 1.0 / len( obsNewSet))
# print 'valid set overlap with test set: %0.4f' % \
# (len(validSet.intersection( obsNewSet)) * 1.0 / len(validSet))
The following code is continued in img_02_fit_lgtRgr_SFDD.
Let's get an idea of what an off-the-shelf classifier can give you on this data. It's always good to check that there is something to learn, and that it's a problem that is not so trivial that a canned solution solves it.
Train a simple model on this data using 50, 100, 1000 and 5000 training samples. Hint: you can use the LogisticRegression model from sklearn.linear_model.
Optional question: train an off-the-shelf model on all the data!
# import graphlab
# print graphlab.version
# graphlab.canvas.set_target('ipynb')
# graphlab.logistic_classifier.create(image_train,target='label',
# features=['image_array'])
print glbObsTrnFtr[0:3,:,:]
print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2]))
print np.reshape(glbObsTrnFtr[0:3,:,:], (3, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])).shape
from sklearn import metrics, linear_model
import pandas as pd
def fitMdl(nFitObs = 50):
mdl = linear_model.LogisticRegression(verbose = 1)
mdl.fit(np.reshape(glbObsTrnFtr[0:nFitObs,:,:], \
(nFitObs, glbObsTrnFtr.shape[1] * glbObsTrnFtr.shape[2])), \
glbObsTrnRsp[0:nFitObs])
print mdl.get_params()
print mdl.coef_.shape
print ' coeff stats:'
for lblIx in xrange(len(dspLabels)):
print ' label:%s; minCoeff:row:%2d, col:%2d, value:%0.4f; maxCoeff:row:%2d, col:%2d, value:%0.4f;' % \
(dspLabels[lblIx], \
mdl.coef_[lblIx,:].argmin() / glbImg['size'], \
mdl.coef_[lblIx,:].argmin() % glbImg['size'], \
mdl.coef_[lblIx,:].min(), \
mdl.coef_[lblIx,:].argmax() / glbImg['size'], \
mdl.coef_[lblIx,:].argmax() % glbImg['size'], \
mdl.coef_[lblIx,:].max())
train_pred_labels = mdl.predict(np.reshape(glbObsTrnFtr[0:nFitObs,:,:], \
(nFitObs , glbImg['size'] ** 2)))
accuracy_train = metrics.accuracy_score(train_pred_labels, glbObsTrnRsp[0:nFitObs])
print ' accuracy train:%0.4f' % (accuracy_train)
print metrics.confusion_matrix(glbObsTrnRsp[0:nFitObs], train_pred_labels)
valid_pred_labels = mdl.predict(np.reshape(glbObsVldFtr, \
(glbObsVldFtr.shape[0], glbImg['size'] ** 2)))
accuracy_valid = metrics.accuracy_score(valid_pred_labels, glbObsVldRsp)
print ' accuracy valid:%0.4f' % (accuracy_valid)
print metrics.confusion_matrix(glbObsVldRsp , valid_pred_labels)
test_pred_labels = mdl.predict(np.reshape(glbObsNewFtr, \
(glbObsNewFtr.shape[0], glbImg['size'] ** 2)))
accuracy_test = metrics.accuracy_score( test_pred_labels, glbObsNewRsp)
print ' accuracy test:%0.4f' % (accuracy_test)
test_conf = pd.DataFrame(metrics.confusion_matrix( glbObsNewRsp, test_pred_labels), \
index = dspLabels, columns = dspLabels)
print test_conf
return(mdl, (accuracy_train, accuracy_valid, accuracy_test))
mdl50 = fitMdl(nFitObs = 50)
models = pd.DataFrame({'nFitObs': [1e2, 1e3, 1e4, 1e5, glbObsTrnFtr.shape[0]]})
models = models.set_index(models['nFitObs'])
models['mdl'] = linear_model.LogisticRegression()
models['accuracy.fit'] = -1; models['accuracy.vld'] = -1; models['accuracy.new'] = -1
for thsN in models['nFitObs']:
models.ix[thsN, 'mdl'], (models.ix[thsN, 'accuracy.fit'], \
models.ix[thsN, 'accuracy.vld'], \
models.ix[thsN, 'accuracy.new'], \
) = fitMdl(nFitObs = thsN)
print models
plt.figure()
plt.plot(models['nFitObs'], models['accuracy.fit'], 'bo-', label = 'fit')
plt.plot(models['nFitObs'], models['accuracy.vld'], 'rs-', label = 'vld')
plt.plot(models['nFitObs'], models['accuracy.new'], 'gp-', label = 'new')
plt.legend()
plt.title("Accuracy")
plt.xscale('log')
axes = plt.gca()
axes.set_xlabel('nFitObs')
# axes.set_xlim([mdlDF['l1_penalty'][mdlDF['RSS.vld'].argmin()] / 10 ** 2, \
# mdlDF['l1_penalty'][mdlDF['RSS.vld'].argmin()] * 10 ** 2])
# axes.set_ylim([0, mdlDF['RSS.vld'].min() * 1.5])
plt.show()
print dspLabels
import pandas as pd